###############################################################################
###############################################################################
## Authors: Taylor Damann, Dahjin Kim, Margit Tavits ##########################
## Paper: Women and Men Politicians' Response to War: Evidence from Ukraine ###
## Replication code for merging & creating dataset for analyses ###############
###############################################################################
###############################################################################



# Start from an empty global workspace. The original `rm(list=rm())` passed the
# NULL returned by the inner `rm()` as the list of names, so nothing was ever
# removed; `ls()` supplies the names of the objects to drop.
# Note: this only removes objects; it does not detach already-loaded packages.
rm(list = ls())
library(dplyr)
library(SnowballC)
library(tm)
library(wordcloud)
library(textstem)
library(quanteda)
library(syuzhet)
library(plm)

### load data
# setwd()
# load("prep_raw.Rdata")
  
########################################################
########## REDACTED DUE TO CROWDTANGLE POLICY ##########
########################################################
### pre-process text
# text <- VectorSource(data$eng)
# text <- Corpus(text)
# text_clean<-tm_map(text, content_transformer(tolower))
# text_clean <- tm_map(text_clean, removePunctuation)
# text_clean <- tm_map(text_clean, removeNumbers)
# text_clean <- tm_map(text_clean, stripWhitespace)
# myStopwords <- c(stopwords('english'), "a", "b") 
# myStopwords <- setdiff(myStopwords, c("d", "e")) 
# text_clean <- tm_map(text_clean, removeWords, myStopwords)
# text_lemma <- tm_map(text_clean, lemmatize_strings)
# text_plain <- tm_map(text_lemma, PlainTextDocument)
# text <- c(unlist(text_plain$content), stringsAsFactors=F)
# data$eng_clean <- text_plain[[1]]$content


### perform sentiment analysis
# data <- data %>% mutate(zet = get_sentiment(eng_clean, method = "syuzhet"),
#                         nrc = get_sentiment(eng_clean, method = "nrc"),
#                         bing = get_sentiment(eng_clean, method = "bing"),
#                         afinn = get_sentiment(eng_clean, method = "afinn"))


### factorize variables, create ITS slopes, invasion cutoffs
# data <- data %>%
#   mutate(woman = as.factor(woman),
#          daysince = as.numeric(date - as.Date("2022-02-24")),
#          daysince1 = as.numeric(daysince + 116),
#          daysince2 = ifelse(daysince <0, 0, daysince+1),
#          invasion = as.factor(ifelse(daysince2 < 1, 0, 1)),
#          office = relevel(as.factor(office), ref="chairman_regional_council")) %>%
#          group_by(id, date) %>% mutate(react_m=mean(totalreactions)) %>% ungroup()


### check data
# class(data$woman) # factor
# class(data$invasion) # factor
# unique(data$date[data$daysince2==1]) # 2022-02-24
# min(data$date[data$invasion==1]) # 2022-02-24
# min(data$daysince1) # should be 1
# unique(data$posttype) # 9 levels, base = Status
# n_distinct(data$id) # should be 469



###############################
########## AVAILABLE ##########
###############################
# Load the shareable aggregate data. The code below expects this file to
# provide a data frame named `data`.
load("prep_aggregate.Rdata")

### call Top2Vec results
# One row per document; sort by document id so rows line up with `data`.
topics <- read.csv("DocIDTopic.csv")
topics <- topics %>% arrange(document_ids)


### merge
# The merge is positional (row i of `topics` goes to row i of `data`), so the
# two tables must have the same number of rows. Without this guard a shorter
# `topics` whose length divides nrow(data) would be silently recycled.
# NOTE(review): this assumes `data` is already ordered by document id - confirm.
stopifnot(nrow(topics) == nrow(data))
data$label <- topics$label
data$topic <- topics$assigned_topic
# Topic label as a factor with "other" as the reference (first) level.
data$label <- relevel(factor(data$label, levels = c("aid", "pride", "sec", "for", "leg", "rel", "rus", "other")), ref="other")


### merge politician committee
committee <- read.csv("committee.csv")

### combine duplicate names
# Two committees appear under a long and a short spelling; rewrite the long
# spelling to the short one so each committee has a single label.
state_power_long <- "Organization of State Power, Local Self-Government, Regional Development and Urban Planning"
state_power_short <- "State Power, Local Self-Government, Regional Development and Urban Planning"
committee_names <- gsub(state_power_long, state_power_short, x=committee$committee)
committee$committee <- gsub("Law Enforcement Activities", "Law Enforcement", committee_names)

### second committee assignment
# Move the committee stored on row 6 into a `committee2` column on row 5, then
# drop row 6.
# NOTE(review): presumably rows 5 and 6 are the same politician - the indices
# are hard-coded, so confirm against committee.csv if that file changes.
first_row <- 5
second_row <- 6
committee$committee2 <- NA
committee$committee2[first_row] <- committee$committee[second_row]
committee <- committee[-second_row, ]

### join with FB data
# Attach committee columns by politician id. Posts whose politician has no
# committee row get "None", which is also made the reference level.
committee_by_id <- committee %>% select(committee, committee2, id)
data <- data %>%
  left_join(committee_by_id, by="id", relationship = "many-to-many") %>%
  mutate(committee = ifelse(is.na(committee), "None", committee),
         committee = relevel(factor(committee), ref="None"))

### merge region data
# Office-holder rosters; the ones merged below are joined on `id`.
mayors <- read.csv("UkraineMayors.csv") # city + region
cabinet <- read.csv("UkraineCabinet.csv") # no region (loaded, not merged below)
mps <- read.csv("UkraineMPsOfficial.csv") # district
governors <- read.csv("UkraineGovernors.csv") # region - Oblast
chairmen <- read.csv("UkraineRCChairmen.csv") # region - Oblast

### merge with full data
# Each join brings in a column that `data` already has (hence the .x/.y
# suffixes). coalesce() prefers the roster value (.y) and falls back to the
# value already in `data` (.x); the suffixed helper columns are then dropped.
data <- data %>% left_join(select(mps, id, district), by="id") %>%
  mutate(district = coalesce(district.y, district.x)) %>%
  select(-district.x, -district.y) %>%
  left_join(select(mayors, id, city, region), by="id") %>%
  # Fix: `city` was previously filled from region.y/region.x, which overwrote
  # every city with a region name. It is now built from the city columns.
  mutate(region = coalesce(region.y, region.x),
         city = coalesce(city.y, city.x)) %>%
  select(-region.x, -region.y, -city.x, -city.y) %>%
  left_join(select(governors, id, region), by="id") %>%
  mutate(region = coalesce(region.y, region.x)) %>%
  select(-region.x, -region.y) %>%
  left_join(select(chairmen, id, region), by="id") %>%
  mutate(region = coalesce(region.y, region.x)) %>%
  select(-region.x, -region.y)

### clean/translate region names into English
# Lookup table of (pattern, replacement) pairs. Rows are applied top to bottom,
# which is the same order in which the substitutions were nested before; the
# " область" suffix is stripped last.
# NOTE(review): the original author remarked that the Kyiv-city pattern
# ("м. Київ|місто Київ") may fail to match; values that still contain a "."
# are presumably handled by the later KyivCity recode - confirm.
region_lookup <- matrix(c(
  "Луганська",          "Luhansk",
  "Івано-Франківська",  "Ivano-Frankivsk",
  "Кіровоградська",     "Kirovohrad",
  "Житомирська",        "Zhytomyr",
  "Полтавська",         "Poltava",
  "Чернівецька",        "Chernivtsi",
  "Волинська",          "Volyn",
  "Черкаська",          "Cherkasy",
  "Херсонська",         "Kherson",
  "Хмельницька",        "Khmelnytskyi",
  "Рівненська",         "Rivne",
  "Донецька",           "Donetsk",
  "м. Київ|місто Київ", "m. Kyiv",
  "Закарпатська",       "Zakarpattia",
  "Одеська",            "Odesa",
  "Вінницька",          "Vinnytsia",
  "Львівська",          "Lviv",
  "Сумська",            "Sumy",
  "Дніпропетровська",   "Dnipropetrovsk",
  "Харківська",         "Kharkiv",
  "Миколаївська",       "Mykolaiv",
  "Київська",           "Kyiv",
  "Чернігівська",       "Chernihiv",
  "Запорізька",         "Zaporizhzhia",
  "Тернопільська",      "Ternopil",
  " область",           ""
), ncol = 2, byrow = TRUE)

region_english <- data$region
for (k in seq_len(nrow(region_lookup))) {
  region_english <- gsub(region_lookup[k, 1], region_lookup[k, 2], region_english)
}
data <- data %>%
  mutate(region = region_english)

### assign region for MPs, based on district
# Map single-mandate electoral district numbers to their oblast. Rows whose
# district is missing or outside 11-223 keep the region they already have.
# Fixes relative to the previous version:
#   * districts 74-82 are Zaporizhzhia (they were labelled "Dnipropetrovsk" a
#     second time, so no MP could ever be assigned to Zaporizhzhia);
#   * districts 201-204 are Chernivtsi and 205-210 are Chernihiv (previously
#     all of 201-210 were labelled "Chernihiv").
# Kyiv city uses the ASCII label "KyivCity".
# NOTE(review): the original comment here said "letter not recognized";
# presumably the Cyrillic Kyiv-city name could not be matched - confirm.
data <- data %>%
  mutate(region = case_when(
    district %in% 11:18   ~ "Vinnytsia",
    district %in% 19:23   ~ "Volyn",
    district %in% 24:40   ~ "Dnipropetrovsk",
    district %in% 41:61   ~ "Donetsk",
    district %in% 62:67   ~ "Zhytomyr",
    district %in% 68:73   ~ "Zakarpattia",
    district %in% 74:82   ~ "Zaporizhzhia",
    district %in% 83:89   ~ "Ivano-Frankivsk",
    district %in% 90:98   ~ "Kyiv",
    district %in% 99:103  ~ "Kirovohrad",
    district %in% 104:114 ~ "Luhansk",
    district %in% 115:126 ~ "Lviv",
    district %in% 127:132 ~ "Mykolaiv",
    district %in% 133:143 ~ "Odesa",
    district %in% 144:151 ~ "Poltava",
    district %in% 152:156 ~ "Rivne",
    district %in% 157:162 ~ "Sumy",
    district %in% 163:167 ~ "Ternopil",
    district %in% 168:181 ~ "Kharkiv",
    district %in% 182:186 ~ "Kherson",
    district %in% 187:193 ~ "Khmelnytskyi",
    district %in% 194:200 ~ "Cherkasy",
    district %in% 201:204 ~ "Chernivtsi",
    district %in% 205:210 ~ "Chernihiv",
    district %in% 211:223 ~ "KyivCity",
    TRUE ~ region
  ))

### combine "м. Київ" and "KyivCity"
# Any region value that still contains a literal "." (e.g. "m. Kyiv") is
# relabelled as "KyivCity".
has_dot <- grepl("\\.", data$region)
data$region[has_dot] <- "KyivCity"

# Missing regions become "None"; region is then stored as a factor with "None"
# as the reference level.
data <- data %>%
  mutate(region = ifelse(is.na(region), "None", region),
         region = factor(region),
         region = relevel(region, ref="None"))

### create larger regions (east/west/south/central)
# Collapse oblasts into four macro-regions; anything not listed (including
# "None") becomes NA.
# Fix: the Western list previously contained "\tZhytomyr" (with a leading tab
# inside the string), so Zhytomyr never matched and was coded NA.
# NOTE(review): Chernivtsi is grouped under "Southern" here - confirm that this
# is the intended classification.
data <- data %>%
  mutate(region2 = case_when(
    region %in% c("Ternopil", "Lviv", "Vinnytsia", "Zakarpattia", "Rivne",
                  "Khmelnytskyi", "Volyn", "Zhytomyr", "Ivano-Frankivsk") ~ "Western",
    region %in% c("Zaporizhzhia", "Kharkiv", "Dnipropetrovsk", "Donetsk",
                  "Luhansk") ~ "Eastern",
    region %in% c("Mykolaiv", "Odesa", "Kherson", "Chernivtsi") ~ "Southern",
    region %in% c("Chernihiv", "KyivCity", "Kyiv", "Sumy", "Cherkasy",
                  "Poltava", "Kirovohrad") ~ "Central",
    TRUE ~ NA_character_
  ))


### check variables
# Interactive sanity checks: list the distinct values of each recoded factor.
# The trailing notes are the authors' expected results.
unique(data[["committee"]]) # 25 levels, base = None
unique(data[["office"]]) # 5 levels, base = chairman_regional_council
unique(data[["region"]]) # 26 levels, base = None


###########################################
########## UNIT: Politician-Day ###########
###########################################
# create a full running time variable
# Balanced politician x day grid: every id is paired with every calendar day
# of the study window (2021-11-01 through 2022-06-01, 213 days).
study_days <- seq(as.Date("2021-11-01"), as.Date("2022-06-01"), by="days")
polday <- expand.grid(id = unique(data$id), date = study_days) %>%
  group_by(id) %>%
  # Day counter 1, 2, ... within each politician; its length is the number of
  # days in the window.
  mutate(daysince1 = seq(from = 1, to = length(study_days), by = 1))

# One row per distinct (id, office, woman, committee, region) combination.
# The previous version used summarize() to return one row per post and then
# de-duplicated them; multi-row summaries are deprecated since dplyr 1.1.0.
# distinct() on the same columns yields the same rows without the warning.
politician <- data %>%
  ungroup() %>%
  distinct(id, office, woman, committee, region)

# Aggregate posts to the politician-day level:
#   num.post        - number of posts that day
#   *_m             - mean sentiment score under each lexicon
#   aid, boost, sec - share of the day's posts labelled "aid", "pride", "sec"
#   react_m         - carried over unchanged from the post-level data
# react_m is passed through per post, so the result can have several rows per
# group before distinct() removes exact duplicates. reframe() is the supported
# dplyr (>= 1.1.0) verb for that; summarize() is deprecated for multi-row
# results.
panel <- data %>%
  group_by(id, daysince1) %>%
  reframe(num.post = n(),
          zet_m = mean(zet),
          nrc_m = mean(nrc),
          afinn_m = mean(afinn),
          bing_m = mean(bing),
          aid = sum(label=="aid")/num.post,
          boost = sum(label=="pride")/num.post,
          sec = sum(label=="sec")/num.post,
          react_m = react_m) %>%
  distinct()
# merge
# Attach politician attributes (by id), then the daily aggregates (by id and
# day counter). Politician-days without posts keep NA in the aggregate columns.
polday <- polday %>%
  left_join(politician, by = c("id")) %>%
  left_join(panel, by = c("id", "daysince1"))

# fill NA with 0 for num.post column
# create running time variables and invasion cutoffs
#   daysince2 - 0 up to and including 2022-02-23, then 1, 2, ... from 2022-02-24
#   invasion  - factor, 0 while daysince2 < 1 and 1 afterwards
#   daysince1 - row counter within each politician
# The result stays grouped by id when it is saved.
pre_invasion_day <- as.Date("2022-02-23")
polday <- polday %>%
  mutate(num.post = ifelse(is.na(num.post), 0, num.post),
         daysince2 = ifelse(date - pre_invasion_day < 0, 0, date - pre_invasion_day),
         invasion = as.factor(ifelse(daysince2 < 1, 0, 1))) %>%
  group_by(id) %>%
  mutate(daysince1 = row_number())

save(polday, file="PoliticianDayUnit.Rdata")



#################################
########## UNIT: Post ###########
#################################

### create an index variable
# Number the posts of each politician within a day (seq = 1, 2, ...) and build
# a "<daysince>.<seq>" string key. The result stays grouped by (id, daysince).
post <- data %>%
  group_by(id, daysince) %>%
  mutate(seq = row_number(),
         dayseq = paste0(daysince, ".", seq))

save(post, file="PostUnit.Rdata")

###############################################################################
###############################################################################